import plotly
print(plotly.__version__)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.impute import SimpleImputer
# Basic packages
import numpy as np, gc
from scipy import stats;
from scipy.stats import zscore, norm
from scipy.stats import randint as sp_randint
import matplotlib.style as style;
style.use('fivethirtyeight')
# Models
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, learning_curve
from wordcloud import WordCloud, STOPWORDS
# Display settings
pd.options.display.max_rows = 500
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
random_state = 42
np.random.seed(random_state)
import copy
import holidays
# Suppress warnings
import warnings; warnings.filterwarnings('ignore')
import random, re
import time
# used to supress display of warnings
import warnings
import missingno as mno
# nlp libraries
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet');
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from wordcloud import WordCloud
import string
from collections import defaultdict
from collections import Counter
# import holoviews as hv
# from holoviews import opts
import os;
from os import makedirs
from gensim.models import Word2Vec
# sampling methods
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
# save models
import pickle
# pre-processing methods
from sklearn.model_selection import train_test_split
# the classification models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# ensemble models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
# methods and classes for evaluation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, precision_score, roc_auc_score
# cross-validation methods
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# feature selection methods
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.feature_extraction.text import CountVectorizer
# pre-processing methods
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import LabelEncoder
# Deep learning libraries
from keras.utils import np_utils
from keras.utils.vis_utils import plot_model
from keras.layers import Input
from keras.layers.merge import Concatenate
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.models import Sequential
from tensorflow.keras import optimizers
from keras.models import Model
from tensorflow.keras.layers import Flatten, Activation, Dense, LSTM, BatchNormalization, Embedding, Dropout, Flatten, Bidirectional, GlobalMaxPool1D
from keras.models import model_from_json
from keras.regularizers import l1, l2, l1_l2
from keras.constraints import maxnorm, min_max_norm
from keras.constraints import unit_norm
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.models import model_from_json
from keras.models import load_model
from keras.wrappers.scikit_learn import KerasClassifier
# Keras pre-processing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
# Color Class to Bold and Color words while printing
class color:
    """ANSI escape sequences for coloured / emphasised terminal output.

    Concatenate a code before the text and close with END, e.g.
    ``print(color.GREEN + 'done' + color.END)``.
    """
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'
# function to create month variable into seasons
def month2seasons(x):
    """Map a month number (1-12) to its season name.

    Southern-Hemisphere convention (the data is South American):
    Sep-Nov spring, Dec-Feb summer, Mar-May autumn, Jun-Aug winter.

    :param x: month as an integer, 1 through 12
    :return: one of 'Spring', 'Summer', 'Autumn', 'Winter'
    :raises ValueError: if x is not a valid month number
    """
    seasons = {9: 'Spring', 10: 'Spring', 11: 'Spring',
               12: 'Summer', 1: 'Summer', 2: 'Summer',
               3: 'Autumn', 4: 'Autumn', 5: 'Autumn',
               6: 'Winter', 7: 'Winter', 8: 'Winter'}
    try:
        return seasons[x]
    except KeyError:
        # The original fell through and raised UnboundLocalError for
        # out-of-range input; fail with an explicit message instead.
        raise ValueError("invalid month number: {!r}".format(x))
# function to plot top Stopwords
def plot_top_stopwords_barchart(text):
    """Bar-chart the 10 most frequent English stopwords occurring in *text*.

    :param text: pandas Series of document strings
    """
    stop = set(stopwords.words('english'))
    # Flatten every document into one list of tokens.
    tokens = [word for doc in text.str.split().values.tolist() for word in doc]
    counts = defaultdict(int)
    for token in tokens:
        if token in stop:
            counts[token] += 1
    most_common = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:10]
    labels, freqs = zip(*most_common)
    plt.bar(labels, freqs)
#function to plot the top n-grams for a text
def plot_top_ngrams_barchart(text, n=2):
    """Plot a horizontal bar chart of the 10 most frequent n-grams in *text*.

    :param text: pandas Series (or iterable) of document strings
    :param n: n-gram size (2 = bigrams, 3 = trigrams, ...)
    """
    def _get_top_ngram(corpus, n=None):
        # Fit on whole documents so the vectorizer can see word sequences.
        vec = CountVectorizer(ngram_range=(n, n)).fit(corpus)
        bag_of_words = vec.transform(corpus)
        sum_words = bag_of_words.sum(axis=0)
        words_freq = [(word, sum_words[0, idx])
                      for word, idx in vec.vocabulary_.items()]
        words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
        return words_freq[:10]
    # NOTE(review): the original also built a stopword set and a flattened
    # token list here but never used either - removed as dead code.
    top_n_grams = _get_top_ngram(text, n)[:10]
    x, y = map(list, zip(*top_n_grams))
    sns.barplot(x=y, y=x)
#Helper functions to get cleaned text column
# POS-tagger model needed by the (currently commented-out) pos-aware lemmatizer.
nltk.download('averaged_perceptron_tagger')
# Contraction -> expansion lexicon used by replace_words().
# NOTE: several keys are capitalised ("I'd", "I'll", ...); lookups must
# handle case explicitly.
appos = {"ain't": "am not", "aren't": "are not", "can't": "cannot",
         "can't've": "cannot have", "'cause": "because",
         "could've": "could have", "couldn't": "could not",
         "couldn't've": "could not have", "didn't": "did not",
         "doesn't": "does not", "don't": "do not", "hadn't": "had not",
         "hadn't've": "had not have", "hasn't": "has not",
         "haven't": "have not", "he'd": "he would", "he'd've": "he would have",
         "he'll": "he will", "he'll've": "he will have",
         "he's": "he is", "how'd": "how did",
         "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
         "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
         "I'll've": "I will have", "I'm": "I am", "I've": "I have",
         "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
         "it'll": "it will", "it'll've": "it will have", "it's": "it is",
         "let's": "let us", "ma'am": "madam", "mayn't": "may not",
         "might've": "might have", "mightn't": "might not",
         "mightn't've": "might not have", "must've": "must have",
         "mustn't": "must not", "mustn't've": "must not have",
         "needn't": "need not", "needn't've": "need not have",
         "o'clock": "of the clock", "oughtn't": "ought not",
         "oughtn't've": "ought not have", "shan't": "shall not",
         "sha'n't": "shall not", "shan't've": "shall not have",
         "she'd": "she would", "she'd've": "she would have",
         "she'll": "she will", "she'll've": "she will have",
         "she's": "she is", "should've": "should have",
         "shouldn't": "should not", "shouldn't've": "should not have",
         "so've": "so have", "so's": "so is",
         "that'd": "that had", "that'd've": "that would have",
         # BUG FIX: value was the garbled "that that is"
         "that's": "that is", "there'd": "there would",
         "there'd've": "there would have", "there's": "there is",
         "they'd": "they would", "they'd've": "they would have",
         "they'll": "they will", "they'll've": "they will have",
         "they're": "they are", "they've": "they have",
         "to've": "to have", "wasn't": "was not", "we'd": "we would",
         "we'd've": "we would have", "we'll": "we will",
         "we'll've": "we will have", "we're": "we are",
         "we've": "we have", "weren't": "were not",
         "what'll": "what will", "what'll've": "what will have",
         "what're": "what are", "what's": "what is",
         "what've": "what have", "when's": "when is",
         "when've": "when have", "where'd": "where did",
         "where's": "where is", "where've": "where have",
         "who'll": "who will", "who'll've": "who will have",
         "who's": "who is", "who've": "who have",
         "why's": "why is", "why've": "why have", "will've": "will have",
         "won't": "will not", "won't've": "will not have",
         "would've": "would have", "wouldn't": "would not",
         "wouldn't've": "would not have", "y'all": "you all",
         "y'all'd": "you all would", "y'all'd've": "you all would have",
         "y'all're": "you all are", "y'all've": "you all have",
         "you'd": "you would", "you'd've": "you would have",
         "you'll": "you will", "you'll've": "you will have",
         "you're": "you are", "you've": "you have"}
# Helper function to replace appos
def replace_words(description):
    """Expand every known contraction in *description* using the appos lexicon.

    :param description: text to clean (coerced to str)
    :return: text with contractions replaced by their expanded forms
    """
    # Build a case-insensitive lookup: appos contains capitalised keys
    # ("I'd", "I'll", ...) which `word.lower()` could never match in the
    # original code, so "I"-contractions were silently left unexpanded.
    lookup = {key.lower(): value for key, value in appos.items()}
    cleaned_description = []
    for word in str(description).split():
        # Keep the word unchanged when no expansion is known.
        cleaned_description.append(lookup.get(word.lower(), word))
    return ' '.join(cleaned_description)
# Helper function to remove punctuations
# Reference: https://www.programiz.com/python-programming/methods/string/translate
PUNCT_TO_REMOVE = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'  # same set as string.punctuation
# Translation table built once at import time rather than on every call.
_PUNCT_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)
def remove_punctuation(text):
    """Return *text* with every punctuation character stripped."""
    return text.translate(_PUNCT_TABLE)
# Helper function to lemmatize
lemmatizer = WordNetLemmatizer()
def lemmatize(text):
    """Lemmatize *text* word by word and return the rebuilt sentence.

    BUG FIX: the original iterated over the CHARACTERS of *text* and
    joined them with '' - lemmatizing single characters is a no-op, so
    the function returned its input unchanged. Split into words and
    join with spaces instead.

    :param text: string to lemmatize
    :return: space-joined string of lemmatized words
    """
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
# Helper function to remove stopwords
stoplist = set(stopwords.words('english'))
# Domain-specific noise tokens seen in the accident descriptions.
stoplist.update(('cm', 'kg', 'mr', 'wa' ,'nv', 'ore', 'da', 'pm', 'am', 'cx'))
# Keep 'not': negation carries signal for accident severity.
stoplist.remove('not')
def remove_stopwords(text):
    """Return *text* with every stoplist token dropped."""
    kept = [word for word in str(text).split() if word not in stoplist]
    return " ".join(kept)
# Helper function for wordcloud
# Reference: https://www.kaggle.com/aashita/word-clouds-of-various-shapes
def plot_wordcloud(text, mask = None, max_words = 500, max_font_size = 40,
                   figure_size = (12, 6), title = None, title_size = 15):
    """Render a word cloud of *text* with the notebook's presentation defaults.

    :param text: text (anything str()-able) to visualise
    :param mask: optional image mask constraining the cloud's shape
    :param max_words: cap on the number of words drawn
    :param max_font_size: accepted for interface compatibility
    :param figure_size: matplotlib figure size in inches
    :param title: optional plot title
    :param title_size: font size of the title
    """
    # NOTE(review): max_font_size is accepted but not forwarded to
    # WordCloud - behaviour preserved from the original.
    cloud = WordCloud(background_color = 'white', max_words = max_words,
                      random_state = 42, width = 350, height = 150,
                      mask = mask, stopwords = stoplist, collocations = False)
    cloud.generate(str(text))
    plt.figure(figsize = figure_size)
    plt.imshow(cloud, interpolation = 'bilinear')
    plt.title(title, fontdict = {'size': title_size, 'color': 'black',
                                 'verticalalignment': 'bottom'})
    plt.axis('off')
    plt.tight_layout()
# Second helper function for lemmatizing
# lemmatizer = WordNetLemmatizer()
# def lem(text):
# pos_dict = {'N': wn.NOUN, 'V': wn.VERB, 'J': wn.ADJ, 'R': wn.ADV}
# return(' '.join([lemmatizer.lemmatize(w,pos_dict.get(t, wn.NOUN)) for w,t in nltk.pos_tag(text.split())]))
#Functions to get cleaned text column
def get_cleaned_desc(df, col_to_clean, cleaned_col_name):
    """Build a cleaned-text column on *df*, printing progress at each step.

    :param df: dataframe modified in place (also returned)
    :param col_to_clean: name of the raw text column
    :param cleaned_col_name: name of the column to create/overwrite
    :return: the same dataframe with the cleaned column added
    """
    # (progress message, transformation) applied in order; each step reads
    # the output of the previous one.
    steps = [
        ('Converting description to lower case', str.lower),
        ('Replacing apostrophes to the standard lexicons', replace_words),
        ('Removing punctuations', remove_punctuation),
        ('Applying Lemmatizer', lemmatize),
        ('Removing multiple spaces between words', lambda s: re.sub(' +', ' ', s)),
        ('Removing stop words', remove_stopwords),
    ]
    source = col_to_clean
    for message, transform in steps:
        print(color.GREEN + message + color.END)
        df[cleaned_col_name] = df[source].apply(transform)
        source = cleaned_col_name
    return df
def get_cleaned_desc_wo_print(df, col_to_clean, cleaned_col_name):
    """Silent variant of get_cleaned_desc: same pipeline, no progress output.

    :param df: dataframe modified in place (also returned)
    :param col_to_clean: name of the raw text column
    :param cleaned_col_name: name of the column to create/overwrite
    :return: the same dataframe with the cleaned column added
    """
    pipeline = [
        str.lower,                       # normalise case
        replace_words,                   # expand contractions
        remove_punctuation,              # strip punctuation
        lemmatize,                       # lemmatize tokens
        lambda s: re.sub(' +', ' ', s),  # collapse repeated spaces
        remove_stopwords,                # drop stopwords
    ]
    column = col_to_clean
    for transform in pipeline:
        df[cleaned_col_name] = df[column].apply(transform)
        column = cleaned_col_name
    return df
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    """Sum the word embeddings of *s* and L2-normalise the result.

    :param s: sentence (coerced to str)
    :return: unit-norm 1-D vector, or a 300-dim zero vector when no token
             has an embedding or the summed vector has zero norm
             (300 matches the embedding dimensionality assumed by the
             original code - confirm against the loaded embeddings)
    """
    words = word_tokenize(str(s).lower())
    # Keep alphabetic, non-stopword tokens only.
    words = [w for w in words if w not in stop_words and w.isalpha()]
    vectors = []
    for w in words:
        try:
            vectors.append(embeddings_index[w])
        except KeyError:
            # Out-of-vocabulary token: skip it (was a bare `except` before).
            continue
    M = np.array(vectors)
    v = M.sum(axis=0)
    # An empty matrix sums to a scalar 0.0, not an ndarray.
    if not isinstance(v, np.ndarray):
        return np.zeros(300)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Guard the division: a degenerate all-zero sum would yield NaNs.
        return np.zeros(300)
    return v / norm
#Helper classes for Machine Learning, Neural Network and NLP models
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class logarithmic loss.

    :param actual: 1-D array of integer class labels, or an already
                   one-hot-encoded matrix
    :param predicted: matrix of per-class predicted probabilities
    :param eps: clip bound that keeps log() away from 0 and 1
    :return: mean negative log-likelihood over all rows
    """
    if actual.ndim == 1:
        # Expand class indices into a one-hot matrix matching `predicted`.
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot
    clipped = np.clip(predicted, eps, 1 - eps)
    n_rows = actual.shape[0]
    return -np.sum(actual * np.log(clipped)) / n_rows
def train_test_model(model, method, X_train, X_test, y_train, y_test, of_type, index, scale, report, save_model):
    """Fit *model*, evaluate it on the test split and return a one-row
    metrics dataframe.

    :param model: unfitted scikit-learn style classifier
    :param method: model name, used for branching and reporting
    :param of_type: 'coef' to print intercept/coefficients after fitting
    :param index: row index for the returned results dataframe
    :param scale: unused here; kept for interface compatibility
    :param report: 'yes' to print the model, confusion matrix and report
    :param save_model: 'yes' to pickle the fitted model to disk
    :return: single-row DataFrame of train/test metrics
    """
    if report == "yes":
        print (model)
        print ("***************************************************************************")
    # NOTE(review): the original branched on CatBoost/LGBM here, but both
    # branches were byte-identical - collapsed into a single fit call.
    model.fit(X_train, y_train)  # Fit the model on Training set
    if of_type == "coef":
        # Intercept and Coefficients (linear models only)
        print("The intercept for our model is {}".format(model.intercept_), "\n")
        for idx, col_name in enumerate(X_train.columns):
            print("The coefficient for {} is {}".format(col_name, model.coef_.ravel()[idx]))
    y_pred = model.predict(X_test)  # Predict on Test set
    # RidgeClassifier has no predict_proba; the log-loss keeps its default.
    mc_logloss = 1.00
    if method != 'RidgeClassifier':
        y_predictions = model.predict_proba(X_test)
        mc_logloss = multiclass_logloss(y_test, y_predictions, eps=1e-15)
    train_accuracy_score = model.score(X_train, y_train)
    test_accuracy_score = model.score(X_test, y_test)
    # Use distinct local names: the original overwrote the imported
    # precision_score/recall_score/f1_score functions with their results.
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    if report == "yes":
        # Model - Confusion matrix
        model_cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(model_cm, annot=True, fmt='.2f', xticklabels = ["I", "II", "III", "IV", "V"] , yticklabels = ["I", "II", "III", "IV", "V"] )
        plt.ylabel('Actual')
        plt.xlabel('Predicted')
        plt.show()
        # Model - Classification report
        model_cr = classification_report(y_test, y_pred)
        print(model_cr)
    # Store the accuracy results for each model in a dataframe for final comparison
    resultsDf = pd.DataFrame({'Method': method, 'Train Accuracy': train_accuracy_score, 'Test Accuracy': test_accuracy_score,
                              'Precision': precision, 'Recall': recall, 'F1-Score': f1,
                              'Multi-Class Logloss': mc_logloss}, index=[index])
    # Save the model (with-block fixes the unclosed file handle).
    if save_model == "yes":
        filename = 'finalised_model.sav'
        with open(filename, 'wb') as fh:
            pickle.dump(model, fh)
    return resultsDf  # return all the metrics along with predictions
def train_test_allmodels(X_train_common, X_test_common, y_train, y_test, scale):
    """Train and evaluate the full suite of baseline classifiers.

    :return: DataFrame with one metrics row per model
    """
    # (name, configured-but-unfitted estimator) pairs, evaluated in order.
    models=[['LogisticRegression',LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state = 1)],
            ['RidgeClassifier',RidgeClassifier(random_state = 1)],
            ['KNeighborsClassifier',KNeighborsClassifier(n_neighbors = 3)],
            ['SVC',SVC(kernel = 'rbf', probability=True)],
            ['DecisionTreeClassifier',DecisionTreeClassifier(criterion = 'gini', random_state=1)],
            ['RandomForestClassifier',RandomForestClassifier(n_estimators=10, random_state=1)],
            ['BaggingClassifier',BaggingClassifier(n_estimators=30, max_samples=0.75, random_state=1, oob_score=True)],
            ['ExtraTreesClassifier',ExtraTreesClassifier(n_estimators = 50, criterion='entropy', max_features='auto', min_samples_split=2,
                                                         bootstrap=True, oob_score=True)],
            ['AdaBoostClassifier',AdaBoostClassifier(n_estimators=100, learning_rate=0.25, random_state=1)],
            ['GradientBoostingClassifier',GradientBoostingClassifier(loss='deviance', n_estimators=50, learning_rate=0.1, validation_fraction=0.2,
                                                                     random_state=1)],
            ['LGBMClassifier',LGBMClassifier(random_state=1, metric = "multi_logloss", objective="multiclass")],
            ['XGBClassifier',XGBClassifier(min_child_weight = 7, max_depth = 6, objective="multi:softmax", learning_rate = 0.1, gamma = 0.4,
                                           colsample_bytree = 0.5)]
            ]
    resultsDf_common = pd.DataFrame()
    # enumerate(..., start=1) replaces the hand-rolled counter.
    for i, (name, classifier) in enumerate(models, start=1):
        # Train and test this model, then append its metrics row.
        reg_resultsDf = train_test_model(classifier, name, X_train_common, X_test_common, y_train, y_test, 'none', i, scale, 'no', 'no')
        resultsDf_common = pd.concat([resultsDf_common, reg_resultsDf])
    return resultsDf_common
def hyperparameterstune_model(name, model, X_train, y_train, param_grid):
    """Tune *model* over *param_grid* with repeated stratified CV and return
    the refitted best estimator.

    LGBMClassifier is tuned with RandomizedSearchCV (its grid is large);
    every other model gets an exhaustive GridSearchCV.

    :param name: model name, used to pick the search strategy
    :param model: unfitted estimator
    :param param_grid: parameter grid / distributions to search
    :return: best estimator found by the search
    """
    start = time.time()  # note the start time
    # Scorer: negated multi-class log-loss (sklearn maximises scores).
    mll_scorer = metrics.make_scorer(multiclass_logloss, greater_is_better=False, needs_proba=True)
    # define grid search
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    if name == 'LGBMClassifier':
        grid_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100, n_jobs=-1, cv=cv,
                                         scoring = mll_scorer, error_score=0)
    else:
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv,
                                   scoring = mll_scorer, error_score=0)
    model_grid_result = grid_search.fit(X_train, y_train)
    # Summarize results. BUG FIX: the label said "Best F1_Score" although
    # the search optimises negated multi-class log-loss.
    print("Best Score (neg. multi-class logloss): %f using %s" % (model_grid_result.best_score_, model_grid_result.best_params_))
    means = model_grid_result.cv_results_['mean_test_score']
    stds = model_grid_result.cv_results_['std_test_score']
    params = model_grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        if param == model_grid_result.best_params_:
            print("%f (%f) with: %r" % (mean, stdev, param))
            print("95% Confidence interval range: ({0:.4f} %, {1:.4f} %)".format(mean-(2*stdev), mean+(2*stdev)))
    end = time.time()  # note the end time
    duration = end - start  # calculate the total duration
    print("Total duration" , duration, "\n")
    return model_grid_result.best_estimator_
# get the accuracy, precision, recall, f1 score from a fitted Keras model
def get_classification_metrics(model, X_test, y_test, target_type):
    """Compute micro-averaged accuracy/precision/recall/F1 for *model*.

    :param model: trained Keras model
    :param target_type: 'multi_class' for single-label targets; anything
                        else is treated as multilabel (outputs rounded)
    :return: (accuracy, precision, recall, f1)
    """
    # predict probabilities for test set
    yhat_probs = model.predict(X_test, verbose=0)
    # predict crisp classes for test set
    if target_type == 'multi_class':
        # predict_classes() was removed from Keras (TF >= 2.6); argmax over
        # the probability rows is its documented replacement.
        yhat_classes = np.argmax(yhat_probs, axis=1)
    else:
        yhat_classes = (np.asarray(model.predict(X_test))).round()  # Multilabel
    # reduce probabilities to a 1d array (first column), as before
    yhat_probs = yhat_probs[:, 0]
    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(y_test, yhat_classes)
    # precision tp / (tp + fp)
    precision = precision_score(y_test, yhat_classes, average='micro')
    # recall: tp / (tp + fn)
    recall = recall_score(y_test, yhat_classes, average='micro')
    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(y_test, yhat_classes, average='micro')
    return accuracy, precision, recall, f1
class Metrics(tf.keras.callbacks.Callback):
    """Keras callback recording micro-averaged F1/precision/recall on a
    validation set at the end of every epoch."""

    def __init__(self, validation_data=()):
        super().__init__()
        # (X_val, y_val, target_type) triple evaluated each epoch.
        self.validation_data = validation_data

    def on_train_begin(self, logs={}):
        # Per-epoch metric histories.
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
        xVal, yVal, target_type = self.validation_data
        if target_type == 'multi_class':
            # BUG FIX: the original referenced a global `model` instead of
            # self.model, and used predict_classes(), which was removed
            # from Keras - argmax over predicted probabilities replaces it.
            val_predict_classes = np.argmax(self.model.predict(xVal, verbose=0), axis=1)
        else:
            val_predict_classes = (np.asarray(self.model.predict(xVal))).round()  # Multilabel
        val_targ = yVal
        _val_f1 = f1_score(val_targ, val_predict_classes, average='micro')
        _val_recall = recall_score(val_targ, val_predict_classes, average='micro')
        _val_precision = precision_score(val_targ, val_predict_classes, average='micro')
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        #print("— train_f1: %f — train_precision: %f — train_recall %f" % (_val_f1, _val_precision, _val_recall))
        return
# from google.colab import drive
# drive.mount('/content/gdrive')
#data = pd.read_csv('/content/gdrive/MyDrive/Capstone/Data Set - industrial_safety_and_health_database_with_accidents_description.csv', index_col=None, header=0)
# Load the industrial-accident dataset from the working directory.
data = pd.read_csv('industrial_safety_and_health_database_with_accidents_description.csv', index_col=None, header=0)
print("Number of rows = " + color.GREEN + str(data.shape[0]) + color.END + " and Number of Columns = " + color.GREEN + str(data.shape[1]) + color.END + " in the Data frame")
# Quick structural inspection (notebook cells: last expression is displayed).
data.dtypes
data.head()
# Print the unique values of every column except the free-text description.
for col in data:
    if str(col)!='Description':
        print("\n" + color.BLUE + color.UNDERLINE + color.BOLD + str(col).upper() + color.END + "\n")
        print(data[col].unique())
# Build a small data dictionary: dtype, missing count and cardinality per column.
datadict = pd.DataFrame(data.dtypes)
datadict['MissingVal'] = data.isnull().sum()
datadict['NUnique']=data.nunique()
datadict
data.describe(include=['object'])
# Work on a deep copy so the raw `data` frame stays untouched.
industry_df = copy.deepcopy(data)
# Drop the CSV's serialized index column.
industry_df.drop("Unnamed: 0", axis=1, inplace=True)
# Rename columns to snake_case (note: the source column 'Data' holds the date).
industry_df.rename(columns={'Data':'date',
                            'Countries':'country',
                            'Local':'local',
                            'Industry Sector':'industry_sector',
                            'Accident Level':'accident_level',
                            'Potential Accident Level':'potential_accident_level',
                            'Genre':'gender',
                            'Employee or Third Party':'emp_type',
                            'Critical Risk':'critical_risk',
                            'Description':'description'}
                   , inplace=True)
industry_df.head()
# Count, inspect and remove exact duplicate rows.
industry_df.duplicated().sum()
duplicates = industry_df.duplicated()
industry_df[duplicates]
industry_df.drop_duplicates(inplace=True)
industry_df.shape
Split 'Date' to Year, Month, Day etc.
# Parse the date column and derive calendar features from it.
industry_df['date'] = pd.to_datetime(industry_df['date'])
industry_df['year'] = industry_df.date.apply(lambda x : x.year)
industry_df['month'] = industry_df.date.apply(lambda x : x.month)
industry_df['day'] = industry_df.date.apply(lambda x : x.day)
industry_df['weekday'] = industry_df.date.apply(lambda x : x.day_name())
industry_df['week_of_year'] = industry_df.date.apply(lambda x : x.weekofyear)
industry_df.head()
Add 'Season' column based on months
# Derive the (Southern-Hemisphere) season from the month number.
industry_df['season'] = industry_df['month'].apply(month2seasons)
industry_df.head(3)
Mark Official Holidays for Brazil in the 'is_holiday' column as '1'
# Collect Brazil's official holiday dates for 2016/2017 and flag rows whose
# accident date falls on one of them.
brazil_holidays = []
print(color.GREEN + color.BOLD + 'List of Brazil holidays in 2016' + color.END)
for date in holidays.Brazil(years = 2016).items():
    # each item is a (datetime.date, holiday-name) tuple
    brazil_holidays.append(str(date[0]))
    print(date)
print(color.GREEN + color.BOLD + 'List of Brazil holidays in 2017' + color.END)
for date in holidays.Brazil(years = 2017).items():
    brazil_holidays.append(str(date[0]))
    print(date)
# 1 when the date part (YYYY-MM-DD) matches a holiday, else 0.
industry_df['is_holiday'] = [1 if str(val).split()[0] in brazil_holidays else 0 for val in industry_df['date']]
industry_df.head(3)
# Side-by-side horizontal bar charts of Accident Level and
# Potential Accident Level frequencies (from the raw `data` frame).
fig,ax = plt.subplots(nrows=1,ncols=2,figsize=(20,5))
acc_level = "Accident Level"
data[acc_level].reset_index().groupby(acc_level).count().sort_values(by=
"index").plot(kind="barh", legend=False,
ax=ax[0]).grid(axis='x')
pot_acc_level = "Potential Accident Level"
data[pot_acc_level].reset_index().groupby(pot_acc_level).count().sort_values(by=
"index").plot(kind="barh", legend=False,
ax=ax[1]).grid(axis='x')
plt.show()
# Pie charts of the share of accidents per country / sector / employment type.
fig = px.pie(industry_df, names='country', template='seaborn')
fig.update_traces(rotation=90, pull=[0.2,0.03,0.1,0.03,0.1], textinfo="percent+label", showlegend=False)
fig.show()
fig = px.pie(industry_df, names='industry_sector', template='seaborn')
fig.update_traces(rotation=90, pull=[0.2,0.03,0.1,0.03,0.1], textinfo="percent+label", showlegend=False)
fig.show()
fig = px.pie(industry_df, names='emp_type', template='seaborn')
fig.update_traces(rotation=90, pull=[0.2,0.03,0.1,0.03,0.1], textinfo="percent+label", showlegend=False)
fig.show()
# Critical-risk categories ordered by descending frequency.
plt.figure(figsize=(20,5))
descending_order = industry_df['critical_risk'].value_counts().sort_values(ascending=False).index
sns.countplot(x=industry_df['critical_risk'],order=descending_order)
plt.xticks(rotation = 'vertical')
Critical Risk 'Others' has been reported more than 200 times
# Share of accidents per site ("Local"), from the raw frame.
fig = px.pie(data, names='Local', template='seaborn')
fig.update_traces(rotation=90, pull=[0.2,0.03,0.1,0.03,0.1], textinfo="percent+label", showlegend=False)
fig.show()
Local 3 has reported more than 20% of the total accidents
# Percentage split of accidents by gender.
gender_cnt = np.round(industry_df['gender'].value_counts(normalize=True) * 100)
gender_cnt.plot(kind='bar', figsize=(5,4), width=0.9, cmap='gray', title='Gender')
#plt.bar(gender_cnt).opts(title="Gender Count", color="#8888ff", xlabel="Gender", ylabel="Percentage", yformatter='%d%%')\
# Percentage split of accidents by year.
year_cnt = np.round(industry_df['year'].value_counts(normalize=True,sort=False) * 100)
year_cnt.plot(kind='bar', figsize=(5,4), width=0.9, cmap='YlGn', title='Year count')
# collecting year in list to work with it.
y_r = industry_df['date'].dt.year
y_16 = len(y_r[y_r==2016]) # checking if there is 2016 in list and getting total numbers of it.
y_17 = len(y_r[y_r==2017]) # checking if there is 2017 in list and getting total numbers of it.
x_axis = ["2016", "2017"]
y_axis = [y_16, y_17]
plt.bar(x_axis, y_axis, color = ['green', 'blue'])
plt.title("Accidents in 2016 and 2017")
plt.show()
The chart shows a declining trend in the number of accidents from 2016 to 2017.
# Percentage split of accidents by month.
month_cnt = np.round(industry_df['month'].value_counts(normalize=True,sort=False) * 100)
month_cnt.plot(kind='bar', figsize=(5,4), width=0.9, cmap='summer', title='Month Count')
This chart again shows a higher rate of accidents in the first 6 months of the year, which can again be attributed to having twice the data as compared to the last 6 months. However, it is important to note that month 2 (Feb) has a much higher count of accidents even though that month has fewer days (29).
# Percentage split of accidents by day of month.
day_cnt = np.round(industry_df['day'].value_counts(normalize=True,sort=False) * 100)
day_cnt.plot(kind='bar', figsize=(5,4), width=0.9, cmap='hsv', title='Day Count')
Here, it is seen that days 4, 8 and 16 have the highest number of accidents reported.
# Percentage split of accidents by weekday, ordered Monday..Sunday.
weekday_cnt = pd.DataFrame(np.round(industry_df['weekday'].value_counts(normalize=True,sort=False) * 100))
weekday_cnt['week_num'] = [['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'].index(i) for i in weekday_cnt.index]
weekday_cnt.sort_values('week_num', inplace=True)
#hv.Bars((weekday_cnt.index, weekday_cnt.weekday)).opts(title="Weekday Count", color="#8888ff", xlabel="Weekdays") * hv.Curve(weekday_cnt['weekday']).opts(width=500, height=300, color='red', line_width=3
# BUG FIX: the original plotted the helper 'week_num' ordering column
# (values 0-6) rather than the weekday percentages themselves.
weekday_cnt.iloc[:, 0].plot(kind='bar', figsize=(5,4), width=0.9, cmap='pink', title='Weekday Count')
Tuesdays and Thursdays have a higher than average accident count
# Helper function for relation between Accident Level/Potential Accident levels and other labels
def target_count(df, col1):
    """Draw side-by-side count plots of *col1* split by accident level and
    by potential accident level.

    :param df: dataframe holding 'accident_level'/'potential_accident_level'
    :param col1: name of the column plotted along the x axis
    :return: result of plt.show()
    """
    fig = plt.figure(figsize = (15, 7.2))
    # One subplot per target variable, drawn with identical settings.
    targets = [(121, 'accident_level', ' count plot by Accident Level'),
               (122, 'potential_accident_level', ' count plot by Potential Accident Level')]
    for position, target, suffix in targets:
        ax = fig.add_subplot(position)
        sns.countplot(x = col1, data = df, ax = ax, orient = 'v',
                      hue = target).set_title(col1.capitalize() + suffix,
                                              fontsize = 13)
        plt.legend(labels = df[target].unique())
        plt.xticks(rotation = 90)
    return plt.show()
# Accident-level breakdowns for the main categorical / time features.
target_count(industry_df, 'gender')
target_count(industry_df, 'emp_type')
target_count(industry_df, 'industry_sector')
target_count(industry_df, 'country')
target_count(industry_df, 'month')
target_count(industry_df, 'year')
target_count(industry_df, 'weekday')
# Cross-tabulated count plots by country.
sns.countplot(x="country", data=industry_df, hue="emp_type")
sns.countplot(x="country", data=industry_df, hue="industry_sector")
sns.countplot(x="country", data=industry_df,hue="gender")
Country_03 has no data for Female
sns.countplot(x="emp_type", data=industry_df,hue="gender")
# Row-wise percentage of accident levels within each gender.
f = lambda x : np.round(x/x.sum() * 100)
ac_gen = industry_df.groupby(['gender','accident_level'])['accident_level'].count().unstack().apply(f, axis=1)
#ac = hv.Bars(pd.melt(ac_gen.reset_index(), ['gender']), ['gender','accident_level'], 'value').opts(opts.Bars(title="Accident Level by Gender Count"))
pot_ac_gen = industry_df.groupby(['gender','potential_accident_level'])['potential_accident_level'].count().unstack().apply(f, axis=1)
#pot_ac = hv.Bars(pd.melt(pot_ac_gen.reset_index(), ['gender']), ['gender','potential_accident_level'], 'value').opts(opts.Bars(title="Potential Accident Level by Gender Count"))
ac_gen.plot(kind='bar', figsize=(15,4), width=0.9, cmap='cool', title='Accident Level by Gender Count')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
pot_ac_gen.plot(kind='bar', figsize=(15,4), width=0.9, cmap='hot', title='Potential Accident Level by Gender Count')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
#(ac + pot_ac).opts(opts.Bars(width=400, height=300,tools=['hover'],show_grid=True,xrotation=0, ylabel="Percentage", yformatter='%d%%'))
# Row-wise percentage of accident levels within each employee type.
f = lambda x : np.round(x/x.sum() * 100)
ac_em = industry_df.groupby(['emp_type','accident_level'])['accident_level'].count().unstack().apply(f, axis=1)
#ac = hv.Bars(pd.melt(ac_em.reset_index(), ['emp_type']), ['emp_type','accident_level'], 'value').opts(opts.Bars(title="Accident Level by Employee type Count"))
pot_ac_em = industry_df.groupby(['emp_type','potential_accident_level'])['potential_accident_level'].count().unstack().apply(f, axis=1)
#pot_ac = hv.Bars(pd.melt(pot_ac_em.reset_index(), ['emp_type']), ['emp_type','potential_accident_level'], 'value').opts(opts.Bars(title="Potential Accident Level by Employee type Count"))
ac_em.plot(kind='bar', figsize=(15,4), width=0.9, cmap='cool', title='Accident Level by Employee type Count')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
pot_ac_em.plot(kind='bar', figsize=(15,4), width=0.9, cmap='hot', title='Potential Accident Level by Employee type Count')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
#(ac + pot_ac).opts(opts.Bars(width=400, height=300,tools=['hover'],show_grid=True,xrotation=0, ylabel="Percentage", yformatter='%d%%',fontsize={'title':9}))
# Row-wise percentage of accident levels within each month.
f = lambda x : np.round(x/x.sum() * 100)
ac_mo = industry_df.groupby(['month','accident_level'])['accident_level'].count().unstack().apply(f, axis=1).fillna(0)
#ac = hv.Curve(ac_mo['I'], label='I') * hv.Curve(ac_mo['II'], label='II') * hv.Curve(ac_mo['III'], label='III') * hv.Curve(ac_mo['IV'], label='IV') * hv.Curve(ac_mo['V'], label='V')\
#.opts(opts.Curve(title="Accident Level by Month Count"))
pot_ac_mo = industry_df.groupby(['month','potential_accident_level'])['potential_accident_level'].count().unstack().apply(f, axis=1).fillna(0)
#pot_ac = hv.Curve(pot_ac_mo['I'], label='I') * hv.Curve(pot_ac_mo['II'], label='II') * hv.Curve(pot_ac_mo['III'], label='III') * hv.Curve(pot_ac_mo['IV'], label='IV')\
# * hv.Curve(pot_ac_mo['V'], label='V') * hv.Curve(pot_ac_mo['VI'], label='VI').opts(opts.Curve(title="Potential Accident Level by Month Count"))
ac_mo.plot(kind='bar', figsize=(15,4), width=0.9, cmap='cool', title='Accident Level by Month Count')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
pot_ac_mo.plot(kind='bar', figsize=(15,4), width=0.9, cmap='hot', title='Potential Accident Level by Month Count')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
#hv.extension('bokeh')
#(ac+pot_ac).opts(opts.Curve(width=800, height=300,tools=['hover'],show_grid=True, ylabel="Percentage", yformatter='%d%%')).cols(1)
f = lambda x : np.round(x/x.sum() * 100)
ac_weekday = industry_df.groupby(['weekday','accident_level'])['accident_level'].count().unstack().apply(f, axis=1).fillna(0)
ac_weekday['week_num'] = [['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'].index(i) for i in ac_weekday.index]
ac_weekday.sort_values('week_num', inplace=True)
ac_weekday.drop('week_num', axis=1, inplace=True)
#ac = hv.Curve(ac_weekday['I'], label='I') * hv.Curve(ac_weekday['II'], label='II') * hv.Curve(ac_weekday['III'], label='III') * hv.Curve(ac_weekday['IV'], label='IV') * hv.Curve(ac_weekday['V'], label='V')\
#.opts(opts.Curve(title="Accident Level by Weekday Count"))
pot_ac_weekday = industry_df.groupby(['weekday','potential_accident_level'])['potential_accident_level'].count().unstack().apply(f, axis=0).fillna(0)
pot_ac_weekday['week_num'] = [['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'].index(i) for i in pot_ac_weekday.index]
pot_ac_weekday.sort_values('week_num', inplace=True)
pot_ac_weekday.drop('week_num', axis=1, inplace=True)
#pot_ac = hv.Curve(pot_ac_weekday['I'], label='I') * hv.Curve(pot_ac_weekday['II'], label='II') * hv.Curve(pot_ac_weekday['III'], label='III') * hv.Curve(pot_ac_weekday['IV'], label='IV')\
# * hv.Curve(pot_ac_weekday['V'], label='V') * hv.Curve(pot_ac_weekday['VI'], label='VI').opts(opts.Curve(title="Potential Accident Level by Weekday Count"))
ac_weekday.plot(kind='bar', figsize=(15,4), width=0.9, cmap='cool', title='Accident Level by Weekday Count')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
pot_ac_weekday.plot(kind='bar', figsize=(15,4), width=0.9, cmap='hot', title='Potential Accident Level by Weekday Count')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
#hv.extension('bokeh')
#(ac+pot_ac).opts(opts.Curve(width=800, height=300,tools=['hover'],show_grid=True, ylabel="Percentage", yformatter='%d%%')).cols(1)
# --- Accident / potential accident level distribution across seasons ---
f = lambda x: np.round(x / x.sum() * 100)  # row of counts -> percentage of row total
_season_order = ['Spring', 'Summer', 'Autumn', 'Winter']

ac_season = (industry_df.groupby(['season', 'accident_level'])['accident_level']
             .count().unstack().apply(f, axis=1).fillna(0))
ac_season = ac_season.reindex(_season_order)  # calendar order instead of alphabetical

# BUG FIX: previously normalised with apply(f, axis=0) (down the columns),
# unlike every other section which normalises each row; use axis=1.
pot_ac_season = (industry_df.groupby(['season', 'potential_accident_level'])['potential_accident_level']
                 .count().unstack().apply(f, axis=1).fillna(0))
pot_ac_season = pot_ac_season.reindex(_season_order)

ac_season.plot(kind='bar', figsize=(15,4), width=0.9, cmap='cool', title='Accident Level by Season Count')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
pot_ac_season.plot(kind='bar', figsize=(15,4), width=0.9, cmap='hot', title='Potential Accident Level by Season Count')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
# Restrict to longer narratives (>100 characters) and inspect the class mix.
indexes = list(industry_df.loc[industry_df['description'].str.len() > 100, 'description'].index)
# BUG FIX: explicit `labels=[...]` lists were zipped onto whatever order
# value_counts() happened to return, so wedges could be silently mislabeled.
# Omitting `labels` lets pandas label each wedge from the value_counts index.
_ = industry_df.loc[indexes, 'accident_level'].value_counts().plot(kind = 'pie', autopct = '%.0f%%', figsize = (10, 6))
_ = industry_df.loc[indexes, 'potential_accident_level'].value_counts().plot(kind = 'pie', autopct = '%.0f%%', figsize = (10, 6))
# Distribution of description lengths, word counts and mean word lengths.
_desc = industry_df['description']
_desc.str.len().hist()                    # characters per description
_desc.str.split().map(len).hist()         # words per description
_desc.str.split().apply(lambda ws: np.mean([len(w) for w in ws])).hist()  # mean word length
# Ensure the NLTK stopword corpus is present (no-op if already downloaded).
nltk.download("stopwords")
# NOTE(review): plot_top_stopwords_barchart is defined elsewhere in this notebook.
plot_top_stopwords_barchart(industry_df['description'])
def plot_top_non_stopwords_barchart(text):
    """Bar-chart the 40 most frequent non-stopword tokens in *text*.

    Parameters
    ----------
    text : pandas Series of str
        Raw descriptions; tokenised by whitespace only (no normalisation).
    """
    stop = set(stopwords.words('english'))
    tokens = [word for line in text.str.split().values.tolist() for word in line]
    # BUG FIX: the original took the overall top-40 tokens *before* removing
    # stopwords, so the chart could end up with far fewer than 40 bars (or
    # none) when the most frequent tokens were stopwords. Filter first.
    counter = Counter(w for w in tokens if w not in stop)
    top = counter.most_common(40)
    if not top:  # nothing to draw (e.g. empty input)
        return
    words, counts = zip(*top)
    sns.barplot(x=list(counts), y=list(words))
plot_top_non_stopwords_barchart(industry_df['description'])
def plot_top_ngrams_barchart(text, n=2):
    """Bar-chart the 10 most frequent n-grams in *text*.

    Parameters
    ----------
    text : iterable of str documents (e.g. a pandas Series).
    n : int, n-gram size (2 = bigrams, 3 = trigrams, ...).
    """
    # Local import in case CountVectorizer is not imported at module level
    # elsewhere in the notebook; harmless if it already is.
    from sklearn.feature_extraction.text import CountVectorizer

    def _get_top_ngram(corpus, n=None):
        # Count n-gram occurrences across the whole corpus and keep the top 10.
        vec = CountVectorizer(ngram_range=(n, n)).fit(corpus)
        bag_of_words = vec.transform(corpus)
        sum_words = bag_of_words.sum(axis=0)
        words_freq = [(word, sum_words[0, idx])
                      for word, idx in vec.vocabulary_.items()]
        return sorted(words_freq, key=lambda item: item[1], reverse=True)[:10]

    # NOTE: the original also built a stopword set and a flat token list here
    # but never used either; that dead code (and a redundant second [:10])
    # has been removed. Behaviour of the chart is unchanged.
    top_n_grams = _get_top_ngram(text, n)
    x, y = map(list, zip(*top_n_grams))
    sns.barplot(x=y, y=x)
# N-gram frequency charts for bigrams, trigrams and 4-grams.
plot_top_ngrams_barchart(industry_df['description'],2)
plot_top_ngrams_barchart(industry_df['description'],3)
plot_top_ngrams_barchart(industry_df['description'],4)
# Clean the raw narratives (get_cleaned_desc is defined elsewhere in this notebook).
industry_df = get_cleaned_desc(industry_df, 'description', 'cleaned_description')
# Word cloud over the cleaned descriptions.
wordcloud = WordCloud(width = 1500, height = 800, random_state=0, background_color='black', colormap='rainbow', \
min_font_size=5, max_words=300, collocations=False).generate(" ".join(industry_df['cleaned_description'].values))
plt.figure(figsize=(15,10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
# Get length of each line
# `color` is an ANSI-escape helper class defined elsewhere in the notebook.
industry_df['line_length'] = industry_df['cleaned_description'].str.len()
print(color.GREEN + color.BOLD + 'Minimum line length: ' + color.END + str(industry_df['line_length'].min()))
print(color.GREEN + color.BOLD + 'Maximum line length: ' + color.END + str(industry_df['line_length'].max()))
print(color.GREEN + color.BOLD + 'Line with maximum length: ' + color.END + str(industry_df[industry_df['line_length'] == industry_df['line_length'].max()]['cleaned_description'].values[0]))
# Get length of each word
industry_df['nb_words'] = industry_df['cleaned_description'].apply(lambda x: len(x.split(' ')))
print(color.BLUE + color.BOLD + 'Minimum number of words: ' + color.END + str(industry_df['nb_words'].min()))
print(color.BLUE + color.BOLD + 'Maximum number of words: ' + color.END + str(industry_df['nb_words'].max()))
print(color.BLUE + color.BOLD + 'Line with maximum number of words: ' + color.END + str(industry_df[industry_df['nb_words'] == industry_df['nb_words'].max()]['cleaned_description'].values[0]))
print(color.GREEN + 'Five point summary for number of words' + color.END)
display(industry_df['nb_words'].describe().round(0).astype(int));
print(color.GREEN+ '99% quantilie: ' + color.END + str(industry_df['nb_words'].quantile(0.99)))
# define training data
# BUG FIX: gensim's Word2Vec expects an iterable of *token lists*; feeding it
# raw strings makes it iterate character-by-character and learn a vocabulary
# of single characters. Tokenise on whitespace first.
sentences = industry_df['cleaned_description'].str.split()
# train model (min_count=1 keeps every token, even hapaxes)
model = Word2Vec(sentences, min_count=1)
# summarize the loaded model
print(color.BLUE+ "Word2Vec Model: " + color.END + str(model))
# summarize vocabulary
words = list(model.wv.index_to_key)
print(color.BLUE+ "Words: " + color.END + str(words))
# save model
model.save('model.bin')
print(color.GREEN+ "Model Saved successfully" + color.END)
# load model (round-trip sanity check)
new_model = Word2Vec.load('model.bin')
print(color.BLUE+ "Model loaded from Disk: " + color.END + str(new_model))
# Load pre-trained 200-d GloVe vectors into a {word: np.ndarray} lookup.
embeddings_index = {}
EMBEDDING_FILE = 'glove.6B.200d.txt'
# Context manager guarantees the handle is closed even if parsing fails
# (the original used explicit open()/close()).
with open(EMBEDDING_FILE, encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        word = values[0]                                    # first field is the token
        coefs = np.asarray(values[1:], dtype='float32')     # remainder is the vector
        embeddings_index[word] = coefs
print(color.GREEN + '\nWord vectors Found:' + color.END + str(len(embeddings_index)))
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    """Return an L2-normalised sum of GloVe vectors for sentence *s*.

    Lowercases and tokenises *s*, drops stopwords and non-alphabetic tokens,
    sums the GloVe vectors of the remaining words and scales the sum to unit
    length. Returns a zero vector when no token has an embedding.
    """
    tokens = word_tokenize(str(s).lower())
    # Single pass: keep alphabetic non-stopwords that have an embedding.
    # (Replaces a bare `except: continue`, which also hid unrelated errors.)
    M = np.array([embeddings_index[w] for w in tokens
                  if w not in stop_words and w.isalpha() and w in embeddings_index])
    v = M.sum(axis=0)
    if type(v) != np.ndarray:  # empty M sums to a scalar 0.0
        # BUG FIX: the fallback used 300 dims, but the loaded file is
        # glove.6B.200d (200-d); mixing widths corrupts the feature frame.
        # Derive the width from the embeddings themselves.
        dim = len(next(iter(embeddings_index.values()))) if embeddings_index else 200
        return np.zeros(dim)
    norm = np.sqrt((v ** 2).sum())
    # Guard against an (unlikely) all-zero sum to avoid dividing by zero.
    return v / norm if norm else v
# create sentence GLOVE embeddings vectors using the above function for training and validation set
industry_glove_df = [sent2vec(x) for x in tqdm(industry_df['cleaned_description'])]
# Peek at the first sentence vector.
industry_glove_df[0]
# Get top 30 Terms as columns
# Build TF-IDF features: the 10 strongest uni-, bi- and tri-gram terms each
# (30 columns total). Variable names are kept for parity with later cells.
industry_tfidf_df = pd.DataFrame()
for i in (1, 2, 3):
    vec_tfidf = TfidfVectorizer(max_features=10, norm='l2',
                                stop_words='english', lowercase=True,
                                use_idf=True, ngram_range=(i, i))
    X = vec_tfidf.fit_transform(industry_df['cleaned_description']).toarray()
    tfs = pd.DataFrame(X, columns=["tfidf_" + n for n in vec_tfidf.get_feature_names()])
    industry_tfidf_df = pd.concat([industry_tfidf_df.reset_index(drop=True),
                                   tfs.reset_index(drop=True)], axis=1)
industry_tfidf_df.head(3)
# Normalise the categorical labels so each value is a single underscore-joined
# token (critical_risk additionally contains embedded newlines to strip).
industry_df['emp_type'] = industry_df['emp_type'].str.replace(' ', '_')
industry_df['emp_type'].value_counts()
industry_df['critical_risk'] = (industry_df['critical_risk']
                                .str.replace('\n', '')
                                .str.replace(' ', '_'))
industry_df['critical_risk'].value_counts().head()
# Assemble a purely numeric feature frame, starting from the date parts.
industry_featenc_df = industry_df[['year','month','day','week_of_year']].reset_index(drop=True)

# LabelEncoder assigns codes in lexicographic order, so ordinal categories are
# prefixed a/b/c/... to force the intended logical ordering of their codes.
industry_df['season'] = (industry_df['season']
                         .replace('Summer', 'aSummer').replace('Autumn', 'bAutumn')
                         .replace('Winter', 'cWinter').replace('Spring', 'dSpring'))
industry_featenc_df['season'] = LabelEncoder().fit_transform(industry_df['season']).astype(np.int8)
industry_df['weekday'] = (industry_df['weekday']
                          .replace('Monday', 'aMonday').replace('Tuesday', 'bTuesday')
                          .replace('Wednesday', 'cWednesday').replace('Thursday', 'dThursday')
                          .replace('Friday', 'eFriday').replace('Saturday', 'fSaturday')
                          .replace('Sunday', 'gSunday'))
industry_featenc_df['weekday'] = LabelEncoder().fit_transform(industry_df['weekday']).astype(np.int8)
# Plain label encoding for the remaining categoricals. The insertion order is
# preserved exactly: later cells scale "the first 12 columns" by position.
for _col in ('country', 'local', 'gender', 'industry_sector',
             'emp_type', 'critical_risk', 'accident_level'):
    industry_featenc_df[_col] = LabelEncoder().fit_transform(industry_df[_col]).astype(np.int8)
industry_featenc_df['potential_accident_level'] = LabelEncoder().fit_transform(
    industry_df['potential_accident_level']
    .replace('I', 'aI').replace('II', 'bII').replace('III', 'cIII')
    .replace('IV', 'dIV').replace('V', 'eV').replace('VI', 'fVI')
).astype(np.int8)
industry_featenc_df.head(3)
# Sanity check the raw target labels.
industry_df['accident_level'].unique()
# convert integers to dummy variables (i.e. one hot encoded)
# NOTE(review): np_utils.to_categorical is the older keras.utils namespace;
# newer TF exposes the same function as tf.keras.utils.to_categorical.
dummy_y = np_utils.to_categorical(industry_featenc_df['accident_level'])
dummy_y
# Confirm there are no NaNs left in the encoded feature frame.
np.any(np.isnan(industry_featenc_df))
# NOTE(review): the original first joined the 30 GloVe columns and then
# immediately overwrote `industry_feat_df` with the TF-IDF join on the next
# line, silently discarding the GloVe features. The dead first join is removed
# here (runtime behaviour unchanged); if the GloVe columns were meant to be
# kept, chain `.join(pd.DataFrame(industry_glove_df).iloc[:, 0:30]
# .reset_index(drop=True))` onto the expression below.
industry_feat_df = industry_featenc_df.join(industry_tfidf_df.reset_index(drop=True))
industry_feat_df.head()
# Display old accident level counts
# Class balance before any resampling.
industry_feat_df['accident_level'].value_counts()
X = industry_feat_df.drop(['accident_level','potential_accident_level'], axis = 1) # Considering all Predictors
y = industry_feat_df['accident_level']
# Two splits are taken with the SAME random_state and the SAME stratify key,
# so they select identical row indices: y_train (integer labels) and
# y_train_dummy (one-hot) stay row-aligned.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 1, stratify = y)
X_train, X_test, y_train_dummy, y_test_dummy = train_test_split(X, dummy_y, test_size = 0.20, random_state = 1, stratify = y)
print(color.BLUE + 'X_train shape: ' + color.END + str((X_train.shape[0], X_train.shape[1])))
print(color.BLUE + 'y_train shape: ' + color.END + str(y_train.shape[0]))
print(color.BLUE + 'X_test shape: ' + color.END + str((X_test.shape[0], X_test.shape[1])))
print(color.BLUE + 'y_test shape: ' + color.END + str(y_test.shape[0]))
# Concatenate our training data back together so resampling keeps X and y aligned.
X_up = pd.concat([X_train, y_train], axis=1)
# The majority class (encoded level 0) fixes the target size for every
# minority class.
pot_acc_level_0_majority = X_up[X_up['accident_level'] == 0]
# Upsample each minority class (levels 1-4) with replacement to the majority
# size. A loop replaces four copy-pasted resample() calls; each call keeps
# random_state=1, so the drawn samples are identical to the original code's.
_upsampled_parts = [pot_acc_level_0_majority]
for _level in (1, 2, 3, 4):
    _minority = X_up[X_up['accident_level'] == _level]
    _upsampled_parts.append(resample(_minority,
                                     replace=True,                               # sample with replacement
                                     n_samples=len(pot_acc_level_0_majority),    # to match majority class
                                     random_state=1))
industry_df_upsampled = pd.concat(_upsampled_parts)
industry_df_upsampled['accident_level'].value_counts()
# Separate input features and target
X_train_up = industry_df_upsampled.drop(['accident_level'], axis = 1) # Considering all Predictors
y_train_up = industry_df_upsampled['accident_level']
# SMOTE synthesises minority-class samples; applied to the TRAINING split only,
# which is correct (test data must stay untouched).
sm = SMOTE(random_state=1)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)
df_smote = pd.concat([pd.DataFrame(X_train_smote), pd.DataFrame(y_train_smote)], axis=1)
df_smote.head()
# Carry the original column names over to the SMOTE frame.
# (Replaces a manual element-by-element append loop with a direct assignment;
# both frames have feature columns followed by the target, in the same order.)
df_smote.columns = industry_df_upsampled.columns
df_smote.head()
# Separate input features and target
X_train_smote = df_smote.iloc[:,:-1] # Considering all Predictors
# NOTE: `-1:` keeps a one-column DataFrame (not a Series), hence the
# ['accident_level'] column access below.
y_train_smote = df_smote.iloc[:,-1:]
X_train_smote.head(1)
y_train_smote['accident_level'].value_counts()
# convert integers to dummy variables (i.e. one hot encoded)
y_train_smote_dummy = np_utils.to_categorical(y_train_smote['accident_level'])
y_train_smote_dummy
# Standardise the first 12 (non-text) feature columns of every training set.
scaler_X = StandardScaler()
pipeline = Pipeline(steps=[('s', scaler_X)])
X_train.iloc[:,:12] = pipeline.fit_transform(X_train.iloc[:,:12])  # fit on train only
# BUG FIX: the test set was previously re-fit with fit_transform, leaking
# test-set statistics into the scaling; it must be transformed with the
# scaler fitted on the training split (and before the pipeline is re-fit below).
X_test.iloc[:,:12] = pipeline.transform(X_test.iloc[:,:12])
# The resampled sets are alternative training sets, so each gets its own fit.
X_train_up.iloc[:,:12] = pipeline.fit_transform(X_train_up.iloc[:,:12])
X_train_smote.iloc[:,:12] = pipeline.fit_transform(X_train_smote.iloc[:,:12])
X_train.head(3)
# ---------------------------------------------------------------------------
# PCA pre-analysis: eigen-decomposition of the training covariance matrix and
# the cumulative-variance-explained curve used to choose the component count.
# ---------------------------------------------------------------------------
# Transform independent features
# NOTE(review): this scaler/pipeline pair is re-created here but never used in
# this section (the data were already scaled above) - it appears to be dead code.
scaler_X = StandardScaler()#StandardScaler()
pipeline = Pipeline(steps=[('s', scaler_X)])
# generating the covariance matrix and the eigen values for the PCA analysis
cov_matrix = np.cov(X_train.T) # the relevanat covariance matrix
print(color.GREEN + 'Covariance Matrix: \n' + color.END + str(cov_matrix))
#generating the eigen values and the eigen vectors
e_vals, e_vecs = np.linalg.eig(cov_matrix)
print(color.GREEN + '\nEigenvectors: \n' + color.END + str(e_vecs))
print(color.GREEN + '\nEigenvalues \n' + color.END + str(e_vals))
# the "cumulative variance explained" analysis
tot = sum(e_vals)
var_exp = [( i /tot ) * 100 for i in sorted(e_vals, reverse=True)]  # % variance per component, largest first
cum_var_exp = np.cumsum(var_exp)
print(color.GREEN + "Cumulative Variance Explained" + color.END + str(cum_var_exp))
# Plotting the variance explained by the principal components and the cumulative variance explained.
plt.figure(figsize=(20 , 5))
plt.bar(range(1, e_vals.size + 1), var_exp, alpha = 0.5, align = 'center', label = 'Individual explained variance')
plt.step(range(1, e_vals.size + 1), cum_var_exp, where='mid', label = 'Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()
# Capturing 90% variance of the data
# Keep the minimal number of principal components explaining 90% of variance.
pca = PCA(n_components=0.90)
X_train_reduced = pca.fit_transform(X_train)   # fit on the training split only
X_test_reduced = pca.transform(X_test)         # project test data with the same fit
for _reduced in (X_train_reduced, X_test_reduced):
    print(_reduced.shape)
# Baseline: DummyClassifier draws labels at random from the training-class
# distribution ('stratified'), giving a floor for the real models.
dummy = DummyClassifier(strategy='stratified').fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)
# checking unique labels
print(color.GREEN + 'Unique predicted labels: ' + color.END + str(np.unique(dummy_pred)))
# checking accuracy
print(color.GREEN + 'Test score: ' + color.END + str(accuracy_score(y_test, dummy_pred)))
# BUG FIX: predict() was called three separate times; the 'stratified'
# strategy is random, so every call produced a DIFFERENT prediction vector and
# the metrics below were computed on different labels than the score printed
# above. Reuse the single prediction for every metric.
yhat_probs = dummy_pred
yhat_classes = dummy_pred
accuracy = accuracy_score(y_test, yhat_classes)
precision = precision_score(y_test, yhat_classes, average='micro')
recall = recall_score(y_test, yhat_classes, average='micro')
f1 = f1_score(y_test, yhat_classes, average='micro')
print(color.BLUE + 'Accuracy: ' + color.END + str(accuracy))
print(color.BLUE + 'Precision: ' + color.END + str(precision))
print(color.BLUE + 'Recall: ' + color.END + str(recall))
print(color.BLUE + 'F1 score: ' + color.END + str(f1))
# Train/evaluate the classical-model suite on the three training variants
# (original, upsampled, SMOTE). train_test_allmodels is defined elsewhere.
train_test_allmodels(X_train, X_test, y_train, y_test, 'no')
train_test_allmodels(X_train_up, X_test, y_train_up, y_test, 'no')
train_test_allmodels(X_train_smote, X_test, y_train_smote, y_test, 'no')
# ---------------------------------------------------------------------------
# Baseline dense network treating the encoded accident level as ONE continuous
# output (mse loss + linear activation). NOTE(review): this regresses the
# class index rather than classifying - presumably a deliberate baseline.
# ---------------------------------------------------------------------------
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)
tf.random.set_seed(seed)
# define the model: four ReLU hidden layers (50-100-150-40) + 1 linear output
model = Sequential()
model.add(Dense(50, input_dim=X_train.shape[1], activation='relu', kernel_initializer='he_uniform'))
model.add(Dense(100, activation='relu', kernel_initializer='he_uniform'))
model.add(Dense(150, activation='relu', kernel_initializer='he_uniform'))
model.add(Dense(40, activation='relu', kernel_initializer='he_uniform'))
model.add(Dense(1, activation='linear'))
# compile the keras model
#opt = optimizers.Adam(lr=1e-3)
opt = SGD(lr=0.001, momentum=0.9)  # NOTE(review): `lr` is deprecated in newer Keras (learning_rate)
model.compile(loss='mse', optimizer=opt, metrics=['accuracy'])
# Use earlystopping
# NOTE(review): `callback` is created but never passed to fit(), so early
# stopping is effectively disabled; only ReduceLROnPlateau is active.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5, min_delta=0.001)
rlrp = ReduceLROnPlateau(monitor='val_loss', factor=0.0001, patience=5, min_delta=1E-4)
# fit the keras model on the dataset
training_history = model.fit(X_train, y_train, epochs=100, batch_size=8, verbose=1, validation_data=(X_test, y_test), callbacks=[rlrp])
model.summary()
# evaluate the keras model
_, train_accuracy = model.evaluate(X_train, y_train, batch_size=8, verbose=0)
_, test_accuracy = model.evaluate(X_test, y_test, batch_size=8, verbose=0)
print(color.GREEN + 'Train accuracy: ' +color.END + str(train_accuracy*100))
print(color.GREEN + 'Test accuracy: ' + color.END + str(test_accuracy*100))
# get_classification_metrics is a notebook helper defined elsewhere.
accuracy, precision, recall, f1 = get_classification_metrics(model, X_test, y_test, '')
print(color.BLUE + 'Accuracy: ' + color.END + str(accuracy))
print(color.BLUE + 'Precision: ' + color.END + str(precision))
print(color.BLUE + 'Recall: ' + color.END + str(recall))
print(color.BLUE + 'F1 score: ' + color.END + str(f1))
epochs = range(len(training_history.history['loss'])) # Get number of epochs
# plot loss learning curves
plt.plot (epochs, training_history.history['loss'], label = 'train')
plt.plot (epochs, training_history.history['val_loss'], label = 'val')
plt.legend(loc = 'upper right')
plt.title ('Training and validation loss')
# plot accuracy learning curves
# NOTE(review): drawn on the same axes as the loss curves (no new figure).
plt.plot(epochs, training_history.history['accuracy'], label = 'train')
plt.plot(epochs, training_history.history['val_accuracy'], label = 'test')
plt.legend(loc = 'upper right')
plt.title ('Training and validation accuracy')
# ---------------------------------------------------------------------------
# Dense softmax classifier (5 accident levels) with L2 regularisation,
# unit-norm kernel constraints, dropout and batch-norm, trained on the
# ORIGINAL (un-resampled) training split with one-hot targets.
# ---------------------------------------------------------------------------
random_state = 42
#param = 1e-9
param = 1e-4  # L2 regularisation strength shared by every layer
# define the model
model = Sequential()
model.add(Dense(10, input_dim=X_train.shape[1], activation='relu', kernel_initializer='he_uniform', kernel_regularizer=l2(param),
                kernel_constraint=unit_norm()))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(Dense(10, activation='relu', kernel_initializer='he_uniform', kernel_regularizer=l2(param),
                kernel_constraint=unit_norm()))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(5, activation='softmax', kernel_regularizer=l2(param),
                kernel_constraint=unit_norm())) # Multilabel
# compile the keras model
#opt = optimizers.Adamax(lr=0.01)
opt = SGD(lr=0.001, momentum=0.9)  # NOTE(review): `lr` is deprecated in newer Keras
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['categorical_accuracy'])
# Use earlystopping
# NOTE(review): `callback` is never passed to fit(); only rlrp and the custom
# Metrics callback (defined elsewhere in this notebook) are active.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2, min_delta=1E-3)
rlrp = ReduceLROnPlateau(monitor='val_loss', factor=0.0001, patience=2, min_delta=1E-4)
target_type = 'multi_label'
metrics = Metrics(validation_data=(X_train, y_train_dummy, target_type))
# fit the keras model on the dataset
training_history = model.fit(X_train, y_train_dummy, epochs=30, batch_size=8, verbose=1, validation_data=(X_test, y_test_dummy), callbacks=[rlrp, metrics])
model.summary()
# evaluate the keras model
_, train_accuracy = model.evaluate(X_train, y_train_dummy, batch_size=8, verbose=0)
_, test_accuracy = model.evaluate(X_test, y_test_dummy, batch_size=8, verbose=0)
print(color.GREEN + 'Train accuracy: ' +color.END + str(train_accuracy*100))
print(color.GREEN + 'Test accuracy: ' + color.END + str(test_accuracy*100))
accuracy, precision, recall, f1 = get_classification_metrics(model, X_test, y_test_dummy, target_type)
print(color.BLUE + 'Accuracy: ' + color.END + str(accuracy))
print(color.BLUE + 'Precision: ' + color.END + str(precision))
print(color.BLUE + 'Recall: ' + color.END + str(recall))
print(color.BLUE + 'F1 score: ' + color.END + str(f1))
epochs = range(len(training_history.history['loss'])) # Get number of epochs
# plot loss learning curves
plt.plot(epochs, training_history.history['loss'], label = 'train')
plt.plot(epochs, training_history.history['val_loss'], label = 'test')
plt.legend(loc = 'upper right')
plt.title ('Training and validation loss')
# plot accuracy learning curves (drawn on the same axes - no new figure)
plt.plot(epochs, training_history.history['categorical_accuracy'], label = 'train')
plt.plot(epochs, training_history.history['val_categorical_accuracy'], label = 'test')
plt.legend(loc = 'upper right')
plt.title ('Training and validation accuracy')
# serialize model to JSON (architecture only)
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print(color.GREEN + "Saved model weights to disk" + color.END)
# Save the model in h5 format (architecture + weights + optimizer state)
model.save("finalized_keras_model.h5")
print(color.GREEN + "Saved model to disk" + color.END)
# ---------------------------------------------------------------------------
# Same softmax architecture as above, but trained on the SMOTE-balanced
# training set. NOTE(review): saving reuses the same model.json / model.h5 /
# finalized_keras_model.h5 filenames, overwriting the previous model's files.
# ---------------------------------------------------------------------------
# fix random seed for reproducibility
random_state = 42
#param = 1e-9
param = 1e-4  # L2 regularisation strength shared by every layer
# define the model
model = Sequential()
model.add(Dense(10, input_dim=X_train_smote.shape[1], activation='relu', kernel_initializer='he_uniform', kernel_regularizer=l2(param),
                kernel_constraint=unit_norm()))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(Dense(10, activation='relu', kernel_initializer='he_uniform', kernel_regularizer=l2(param),
                kernel_constraint=unit_norm()))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(5, activation='softmax', kernel_regularizer=l2(param),
                kernel_constraint=unit_norm())) # Multilabel
# compile the keras model
#opt = optimizers.Adamax(lr=0.01)
opt = SGD(lr=0.001, momentum=0.9)  # NOTE(review): `lr` is deprecated in newer Keras
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['categorical_accuracy'])
# Use earlystopping
# NOTE(review): `callback` is never passed to fit(); only rlrp and Metrics run.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=7, min_delta=1E-3)
rlrp = ReduceLROnPlateau(monitor='val_loss', factor=0.0001, patience=5, min_delta=1E-4)
target_type = 'multi_label'
metrics = Metrics(validation_data=(X_train_smote, y_train_smote_dummy, target_type))
# fit the keras model on the dataset
training_history = model.fit(X_train_smote, y_train_smote_dummy, epochs=100, batch_size=8, verbose=1, validation_data=(X_test, y_test_dummy), callbacks=[rlrp, metrics])
model.summary()
# evaluate the keras model
_, train_accuracy = model.evaluate(X_train_smote, y_train_smote_dummy, batch_size=8, verbose=0)
_, test_accuracy = model.evaluate(X_test, y_test_dummy, batch_size=8, verbose=0)
print(color.GREEN + 'Train accuracy: ' + color.END + str(train_accuracy*100))
print(color.GREEN + 'Test accuracy: ' + color.END + str(test_accuracy*100))
accuracy, precision, recall, f1 = get_classification_metrics(model, X_test, y_test_dummy, target_type)
print(color.BLUE + 'Accuracy: ' + color.END + str(accuracy))
print(color.BLUE + 'Precision: ' + color.END + str(precision))
print(color.BLUE + 'Recall: ' + color.END + str(recall))
print(color.BLUE + 'F1 score: ' + color.END + str(f1))
epochs = range(len(training_history.history['loss'])) # Get number of epochs
# plot loss learning curves
plt.plot(epochs, training_history.history['loss'], label = 'train')
plt.plot(epochs, training_history.history['val_loss'], label = 'test')
plt.legend(loc = 'upper right')
plt.title ('Training and validation loss')
# plot accuracy learning curves (same axes - no new figure)
plt.plot(epochs, training_history.history['categorical_accuracy'], label = 'train')
plt.plot(epochs, training_history.history['val_categorical_accuracy'], label = 'test')
plt.legend(loc = 'upper right')
plt.title ('Training and validation accuracy')
# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print(color.GREEN + "Saved model weights to disk" + color.END)
# Save the model in h5 format
model.save("finalized_keras_model.h5")
print(color.GREEN + "Saved model to disk" + color.END)
# Select input and output features: cleaned narrative + critical-risk label.
industry_df['input_desc'] = industry_df['cleaned_description'] + " " +industry_df['critical_risk']
# Strip digits. Raw-string pattern makes the regex intent explicit
# (behaviour is identical to the previous '\d+' literal).
industry_df['input_desc'] = industry_df['input_desc'].str.replace(r'\d+', '')
X_text = industry_df['input_desc']
y_text = industry_df['accident_level']
y_text = LabelEncoder().fit_transform(y_text)
# Divide our data into testing and training sets:
X_text_train, X_text_test, y_text_train, y_text_test = train_test_split(X_text, y_text, test_size = 0.20, random_state = 1, stratify = y_text)
print(color.GREEN + 'X_text_train shape : ' + color.END + str(X_text_train.shape[0]))
print(color.GREEN + 'y_text_train shape : ' + color.END + str(y_text_train.shape[0]))
print(color.GREEN + 'X_text_test shape : ' + color.END + str(X_text_test.shape[0]))
print(color.GREEN + 'y_text_test shape : ' + color.END + str(y_text_test.shape[0]))
# Convert both the training and test labels into one-hot encoded vectors:
y_text_train = np_utils.to_categorical(y_text_train)
y_text_test = np_utils.to_categorical(y_text_test)
# Map words to integer indexes. The tokenizer is fitted on the TRAINING split
# only, which correctly avoids leaking test vocabulary.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_text_train)
X_text_train = tokenizer.texts_to_sequences(X_text_train)
X_text_test = tokenizer.texts_to_sequences(X_text_test)
# Sequences vary in length; pad/truncate everything to a fixed 100 tokens.
# NOTE: vocab_size uses the FULL word_index (not just the 5000 kept words) so
# the embedding matrix below covers every index the tokenizer can emit.
vocab_size = len(tokenizer.word_index) + 1
print("vocab_size:", vocab_size)
maxlen = 100
X_text_train = pad_sequences(X_text_train, padding='post', maxlen=maxlen)
X_text_test = pad_sequences(X_text_test, padding='post', maxlen=maxlen)
# Load the pre-trained 200-d GloVe embeddings.
embedding_size = 200
embeddings_dictionary = dict()
# Context manager guarantees the file handle is closed (was open()/close()).
with open('glove.6B.200d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions
# Embedding matrix: row i holds the GloVe vector for tokenizer index i;
# rows for out-of-GloVe words stay zero.
embedding_matrix = np.zeros((vocab_size, embedding_size))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
len(embeddings_dictionary.values())
# ---------------------------------------------------------------------------
# Bidirectional-LSTM text classifier over the padded GloVe-indexed sequences.
# The embedding layer is frozen (trainable=False) to keep the GloVe vectors.
# ---------------------------------------------------------------------------
random_state = 42
# Build a LSTM Neural Network
deep_inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(vocab_size, embedding_size, weights=[embedding_matrix], trainable=False)(deep_inputs)
LSTM_Layer_1 = Bidirectional(LSTM(128, return_sequences = True))(embedding_layer)
max_pool_layer_1 = GlobalMaxPool1D()(LSTM_Layer_1)  # max over time -> fixed-size vector
# Funnel of progressively smaller dense layers, each preceded by 50% dropout.
drop_out_layer_1 = Dropout(0.5, input_shape = (256,))(max_pool_layer_1)
dense_layer_1 = Dense(128, activation = 'relu')(drop_out_layer_1)
drop_out_layer_2 = Dropout(0.5, input_shape = (128,))(dense_layer_1)
dense_layer_2 = Dense(64, activation = 'relu')(drop_out_layer_2)
drop_out_layer_3 = Dropout(0.5, input_shape = (64,))(dense_layer_2)
dense_layer_3 = Dense(32, activation = 'relu')(drop_out_layer_3)
drop_out_layer_4 = Dropout(0.5, input_shape = (32,))(dense_layer_3)
dense_layer_4 = Dense(10, activation = 'relu')(drop_out_layer_4)
drop_out_layer_5 = Dropout(0.5, input_shape = (10,))(dense_layer_4)
dense_layer_5 = Dense(5, activation='softmax')(drop_out_layer_5)  # 5 accident levels
model = Model(inputs=deep_inputs, outputs=dense_layer_5)
opt = SGD(lr=0.001, momentum=0.9)  # NOTE(review): `lr` is deprecated in newer Keras
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['acc'])
print(model.summary())
# Use earlystopping
# NOTE(review): `callback` is never passed to fit(); only rlrp and Metrics run.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=7, min_delta=1E-3)
rlrp = ReduceLROnPlateau(monitor='val_loss', factor=0.0001, patience=5, min_delta=1E-4)
target_type = 'multi_label'
metrics = Metrics(validation_data=(X_text_train, y_text_train, target_type))
# fit the keras model on the dataset
training_history = model.fit(X_text_train, y_text_train, epochs=100, batch_size=8, verbose=1, validation_data=(X_text_test, y_text_test), callbacks=[rlrp, metrics])
# evaluate the keras model
_, train_accuracy = model.evaluate(X_text_train, y_text_train, batch_size=8, verbose=0)
_, test_accuracy = model.evaluate(X_text_test, y_text_test, batch_size=8, verbose=0)
print(color.GREEN + 'Train accuracy: ' + color.END + str(train_accuracy*100))
print(color.GREEN + 'Test accuracy: ' + color.END + str(test_accuracy*100))
accuracy, precision, recall, f1 = get_classification_metrics(model, X_text_test, y_text_test, target_type)
print(color.BLUE + 'Accuracy: ' + color.END + str(accuracy))
print(color.BLUE + 'Precision: ' + color.END + str(precision))
print(color.BLUE + 'Recall: ' + color.END + str(recall))
print(color.BLUE + 'F1 score:' + color.END + str(f1))
epochs = range(len(training_history.history['loss'])) # Get number of epochs
# plot loss learning curves
plt.plot(epochs, training_history.history['loss'], label = 'train')
plt.plot(epochs, training_history.history['val_loss'], label = 'test')
plt.legend(loc = 'upper right')
plt.title ('Training and validation loss')
# plot accuracy learning curves (same axes - no new figure)
plt.plot(epochs, training_history.history['acc'], label = 'train')
plt.plot(epochs, training_history.history['val_acc'], label = 'test')
plt.legend(loc = 'upper right')
plt.title ('Training and validation accuracy')
# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model weights to disk")
# Save the model in h5 format
model.save("finalized_keras_model_v0.h5")
print("Saved model to disk")
# Select input and output features
# Encoded categorical/date features only (no text features) for this model.
X_cat = industry_featenc_df.drop(['accident_level','potential_accident_level'], axis = 1)
y_cat = industry_df['accident_level']
y_cat = LabelEncoder().fit_transform(y_cat)
X_cat_train, X_cat_test, y_cat_train, y_cat_test = train_test_split(X_cat, y_cat, test_size = 0.20, random_state = 1, stratify = y_cat)
print(color.GREEN + 'X_cat_train shape : ' + color.END + str(X_cat_train.shape[0]))
print(color.GREEN + 'y_cat_train shape : ' + color.END + str(y_cat_train.shape[0]))
print(color.GREEN + 'X_cat_test shape : ' + color.END + str(X_cat_test.shape[0]))
print(color.GREEN + 'y_cat_test shape : ' + color.END + str(y_cat_test.shape[0]))
# One-hot encode the integer labels for the softmax output below.
y_cat_train = np_utils.to_categorical(y_cat_train)
y_cat_test = np_utils.to_categorical(y_cat_test)
# Variable transformation using StandardScaler (first 12 feature columns).
scaler_X = StandardScaler()
X_cat_train.iloc[:,:12] = scaler_X.fit_transform(X_cat_train.iloc[:,:12])  # fit on train only
# BUG FIX: the test partition was previously re-fit with fit_transform,
# leaking test-set statistics; transform it with the train-fitted scaler.
X_cat_test.iloc[:,:12] = scaler_X.transform(X_cat_test.iloc[:,:12])
# ---------------------------------------------------------------------------
# Functional-API version of the dense softmax classifier, trained on the
# categorical-only features (X_cat_*).
# ---------------------------------------------------------------------------
param = 1e-4  # L2 regularisation strength shared by every layer
input2 = Input(shape=(X_cat_train.shape[1],))
dense_layer_1 = Dense(10, input_dim=X_cat_train.shape[1], activation='relu', kernel_initializer='he_uniform', kernel_regularizer=l2(param),
                      kernel_constraint=unit_norm())(input2)
drop_out_layer_1 = Dropout(0.2)(dense_layer_1)
batch_norm_layer_1 = BatchNormalization()(drop_out_layer_1)
dense_layer_2 = Dense(10, activation='relu', kernel_initializer='he_uniform', kernel_regularizer=l2(param),
                      kernel_constraint=unit_norm())(batch_norm_layer_1)
drop_out_layer_2 = Dropout(0.5)(dense_layer_2)
batch_norm_layer_2 = BatchNormalization()(drop_out_layer_2)
dense_layer_3 = Dense(5, activation='softmax', kernel_regularizer=l2(param), kernel_constraint=unit_norm())(batch_norm_layer_2)
model = Model(inputs=input2, outputs=dense_layer_3)
# compile the keras model
#opt = optimizers.Adamax(lr=0.01)
opt = SGD(lr=0.001, momentum=0.9)  # NOTE(review): `lr` is deprecated in newer Keras
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['acc'])
print(model.summary())

# Use earlystopping
# NOTE(review): `callback` is never passed to fit(); only rlrp and Metrics run.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=7, min_delta=1E-3)
rlrp = ReduceLROnPlateau(monitor='val_loss', factor=0.0001, patience=5, min_delta=1E-4)
target_type = 'multi_label'
metrics = Metrics(validation_data=(X_cat_train, y_cat_train, target_type))
# fit the keras model on the dataset
training_history = model.fit(X_cat_train, y_cat_train, epochs=100, batch_size=8, verbose=1, validation_data=(X_cat_test, y_cat_test), callbacks=[rlrp, metrics])
# evaluate the keras model
_, train_accuracy = model.evaluate(X_cat_train, y_cat_train, batch_size=8, verbose=0)
_, test_accuracy = model.evaluate(X_cat_test, y_cat_test, batch_size=8, verbose=0)
print(color.GREEN + 'Train accuracy: ' + color.END + str(train_accuracy*100))
print(color.GREEN + 'Test accuracy: ' + color.END + str(test_accuracy*100))
accuracy, precision, recall, f1 = get_classification_metrics(model, X_cat_test, y_cat_test, target_type)
print(color.BLUE + 'Accuracy: ' + color.END + str(accuracy))
print(color.BLUE + 'Precision:' + color.END + str(precision))
print(color.BLUE + 'Recall: ' + color.END + str(recall))
print(color.BLUE + 'F1 score: ' + color.END + str(f1))
epochs = range(len(training_history.history['loss'])) # Get number of epochs
# plot loss learning curves
plt.plot(epochs, training_history.history['loss'], label = 'train')
plt.plot(epochs, training_history.history['val_loss'], label = 'test')
plt.legend(loc = 'upper right')
plt.title ('Training and validation loss')
# plot accuracy learning curves (same axes - no new figure)
plt.plot(epochs, training_history.history['acc'], label = 'train')
plt.plot(epochs, training_history.history['val_acc'], label = 'test')
plt.legend(loc = 'upper right')
plt.title ('Training and validation accuracy')
# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model weights to disk")
# Save the model in h5 format
model.save("finalized_keras_model_v1.h5")
print("Saved model to disk")
# ------------------------------------------------------------------
# Combined model: a frozen-embedding BiLSTM branch over the accident
# description text, concatenated with an MLP branch over the
# categorical features, feeding a shared softmax head (5 classes).
# ------------------------------------------------------------------
# --- text branch ---
input_1 = Input(shape=(maxlen,))
embedding_layer = Embedding(vocab_size, embedding_size, weights=[embedding_matrix], trainable=False)(input_1)
LSTM_Layer_1 = Bidirectional(LSTM(128, return_sequences = True))(embedding_layer)
max_pool_layer_1 = GlobalMaxPool1D()(LSTM_Layer_1)
drop_out_layer_1 = Dropout(0.5, input_shape = (256,))(max_pool_layer_1)
dense_layer_1 = Dense(128, activation = 'relu')(drop_out_layer_1)
drop_out_layer_2 = Dropout(0.5, input_shape = (128,))(dense_layer_1)
dense_layer_2 = Dense(64, activation = 'relu')(drop_out_layer_2)
drop_out_layer_3 = Dropout(0.5, input_shape = (64,))(dense_layer_2)
dense_layer_3 = Dense(32, activation = 'relu')(drop_out_layer_3)
drop_out_layer_4 = Dropout(0.5, input_shape = (32,))(dense_layer_3)
dense_layer_4 = Dense(10, activation = 'relu')(drop_out_layer_4)
drop_out_layer_5 = Dropout(0.5, input_shape = (10,))(dense_layer_4)
#-------------------------------------------------------------------------------
# --- categorical branch (mirrors the standalone MLP above) ---
param = 1e-4  # shared L2 regularization strength
input_2 = Input(shape=(X_cat_train.shape[1],))
dense_layer_5 = Dense(10, input_dim=X_cat_train.shape[1], activation='relu',
                      kernel_initializer='he_uniform', kernel_regularizer=l2(param),
                      kernel_constraint=unit_norm())(input_2)
drop_out_layer_6 = Dropout(0.2)(dense_layer_5)
batch_norm_layer_1 = BatchNormalization()(drop_out_layer_6)
dense_layer_6 = Dense(10, activation='relu', kernel_initializer='he_uniform',
                      kernel_regularizer=l2(param),
                      kernel_constraint=unit_norm())(batch_norm_layer_1)
drop_out_layer_7 = Dropout(0.5)(dense_layer_6)
batch_norm_layer_2 = BatchNormalization()(drop_out_layer_7)
# --- merge the two branches and classify ---
concat_layer = Concatenate()([drop_out_layer_5, batch_norm_layer_2])
dense_layer_7 = Dense(10, activation='relu')(concat_layer)
output = Dense(5, activation='softmax')(dense_layer_7)
model = Model(inputs=[input_1, input_2], outputs=output)
# compile the keras model
#opt = optimizers.Adamax(lr=0.01)
opt = SGD(lr=0.001, momentum=0.9)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['acc'])
print(model.summary())

# Use earlystopping.
# BUG FIX: this callback was created but never passed to fit(), so it had
# no effect; it is now included in the callbacks list as clearly intended.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=7, min_delta=1E-3)
# NOTE(review): factor=0.0001 MULTIPLIES the learning rate on plateau —
# confirm this was not meant to be min_lr instead.
rlrp = ReduceLROnPlateau(monitor='val_loss', factor=0.0001, patience=5, min_delta=1E-4)
target_type = 'multi_label'
metrics = Metrics(validation_data=([X_text_train, X_cat_train], y_cat_train, target_type))
# fit the keras model on the dataset (two inputs: text sequences + categorical)
training_history = model.fit([X_text_train, X_cat_train], y_cat_train, epochs=100, batch_size=8, verbose=1,
                             validation_data=([X_text_test, X_cat_test], y_cat_test),
                             callbacks=[callback, rlrp, metrics])
# evaluate the keras model
_, train_accuracy = model.evaluate([X_text_train, X_cat_train], y_cat_train, batch_size=8, verbose=0)
_, test_accuracy = model.evaluate([X_text_test, X_cat_test], y_cat_test, batch_size=8, verbose=0)
print(color.GREEN + 'Train accuracy: ' + color.END + str(train_accuracy*100))
print(color.GREEN + 'Test accuracy: ' + color.END + str(test_accuracy*100))
accuracy, precision, recall, f1 = get_classification_metrics(model, [X_text_test, X_cat_test], y_cat_test, target_type)
print(color.BLUE + 'Accuracy: ' + color.END + str(accuracy))
print(color.BLUE + 'Precision: ' + color.END + str(precision))
print(color.BLUE + 'Recall: ' + color.END + str(recall))
print(color.BLUE + 'F1 score: ' + color.END + str(f1))
epochs = range(len(training_history.history['loss'])) # number of epochs actually run
# plot loss learning curves.
# BUG FIX: each curve pair now gets its own figure; previously the
# accuracy curves were drawn over the loss curves on the same axes and
# the second title overwrote the first.
plt.figure()
plt.plot(epochs, training_history.history['loss'], label = 'train')
plt.plot(epochs, training_history.history['val_loss'], label = 'test')
plt.legend(loc = 'upper right')
plt.title('Training and validation loss')
plt.show()
# plot accuracy learning curves
plt.figure()
plt.plot(epochs, training_history.history['acc'], label = 'train')
plt.plot(epochs, training_history.history['val_acc'], label = 'test')
plt.legend(loc = 'upper right')
plt.title('Training and validation accuracy')
plt.show()
from io import StringIO
def chatbot_response(msg):
    """Predict the accident level for a free-text description.

    Parameters
    ----------
    msg : str
        Raw accident description typed by the user.

    Returns
    -------
    str
        A human-readable sentence naming one of the five accident levels
        (empty string only if the model emits an out-of-range class).
    """
    op = ""
    final_model = load_model("finalized_keras_model_v0.h5")
    # Wrap the message as a one-column CSV (header "desc") so the existing
    # dataframe-based cleaning helper can be reused unchanged.
    s = "desc" + "\n" + msg
    StringData = StringIO(s)
    # let's read the data using the Pandas read_csv() function
    df = pd.read_csv(StringData, sep=";")
    df_op = get_cleaned_desc_wo_print(df, 'desc', 'cleaned_desc')
    # NOTE(review): fitting a brand-new Tokenizer on this single message
    # yields word indices unrelated to the vocabulary the model was trained
    # on — the tokenizer fitted at training time should be persisted and
    # loaded here instead. Left as-is pending that artifact.
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(df_op)
    df_op = tokenizer.texts_to_sequences(df_op)
    vocab_size = len(tokenizer.word_index) + 1
    maxlen = 100
    df_op = pad_sequences(df_op, padding='post', maxlen=maxlen)
    # BUG FIX: keep the batch dimension. df_op[0] is a flat (maxlen,)
    # vector that Keras would treat as maxlen scalar samples; df_op[0:1]
    # is one sample of shape (1, maxlen).
    seq_predictions = final_model.predict(df_op[0:1])
    # BUG FIX: pick the most probable class directly. The original code
    # binarized at 0.5 and took the index of the first 1, which silently
    # fell back to class 1 ("Minor") whenever no probability exceeded 0.5.
    max_index = int(np.argmax(seq_predictions[0])) + 1
    if max_index == 1:
        op = 'The accident Level is Minor'
    elif max_index == 2:
        op = 'The accident Level is Moderate'
    elif max_index == 3:
        op = 'The accident Level is Severe'
    elif max_index == 4:
        op = 'The accident Level is Critical'
    elif max_index == 5:
        op = 'The accident Level is Fatal'
    return op
#Creating GUI with tkinter
import tkinter
from tkinter import *

def send():
    """Handle the Send button: echo the user's message into the chat log
    and append either a greeting or the model's accident-level prediction.

    Reads/clears the module-level EntryBox widget and writes to ChatLog.
    """
    msg = EntryBox.get("1.0", 'end-1c').strip()
    EntryBox.delete("0.0", END)
    # Lower-case once instead of twice (original called msg.lower() per test)
    lowered = msg.lower()
    if lowered == 'hi' or lowered == 'hello':
        ChatLog.config(state=NORMAL)
        ChatLog.insert(END, "You: " + msg + '\n\n')
        ChatLog.config(foreground="#442265", font=("Verdana", 12 ))
        ChatLog.insert(END, "Bot: Hello! Kindly put in the description of the accident" + '\n\n')
        ChatLog.config(state=DISABLED)
        ChatLog.yview(END)
    elif msg != '':
        ChatLog.config(state=NORMAL)
        ChatLog.insert(END, "You: " + msg + '\n\n')
        ChatLog.config(foreground="#442265", font=("Verdana", 12 ))
        res = chatbot_response(msg)
        ChatLog.insert(END, "Bot: " + res + '\n\n')
        ChatLog.config(state=DISABLED)
        ChatLog.yview(END)
# Build the fixed-size chat window. ChatLog and EntryBox are module-level
# globals read by send() above; do not rename them.
base = Tk()
base.title("Accident Level Prediction Bot")
base.geometry("400x500")
base.resizable(width=FALSE, height=FALSE)
#Create Chat window (read-only until send() temporarily enables it)
ChatLog = Text(base, bd=0, bg="white", height="8", width="50", font="Arial",)
ChatLog.config(state=DISABLED)
#Bind scrollbar to Chat window
scrollbar = Scrollbar(base, command=ChatLog.yview, cursor="heart")
ChatLog['yscrollcommand'] = scrollbar.set
#Create Button to send message
SendButton = Button(base, font=("Verdana",12,'bold'), text="Send", width="12", height=5,
bd=0, bg="#32de97", activebackground="#3c9d9b",fg='#ffffff',
command= send )
#Create the box to enter message
EntryBox = Text(base, bd=0, bg="white",width="29", height="5", font="Arial")
#EntryBox.bind("<Return>", send)
#Place all components on the screen (absolute placement, window not resizable)
scrollbar.place(x=376,y=6, height=386)
ChatLog.place(x=6,y=6, height=386, width=370)
EntryBox.place(x=128, y=401, height=90, width=265)
SendButton.place(x=6, y=401, height=90)
# Blocking Tk event loop — must be the last statement in the script
base.mainloop()